3. Per session and per user analysis

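Functions that compute per-session and per-user statistics (checkpoint times, play time, deaths, event counts, survey scores) from the RedMetrics event logs and the Google Form answers.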

Table of Contents

  1. Preparation

  2. Constants

  3. Functions

Preparation


In [ ]:
# Execute the sampling notebook first; this notebook relies on names that it (or the
# notebooks it chains to) defines — presumably pd, np, getUserSessions, etc. TODO confirm.
%run "../Functions/2.1 Sampling.ipynb"
# Print this notebook's title (presumably to trace progress through chained %run calls).
print("3. Per session and per user analysis")

Constants


In [ ]:
# Columns kept when slicing typed events for the per-session section statistics below.
perSessionRelevantColumns = ['sessionId', 'serverTime', 'section']

#reachEvents = rmdf1522[rmdf1522['type']=='reach'].loc[:,perSessionRelevantColumns]
#deathEvents = rmdf1522[rmdf1522['type']=='death'].loc[:,perSessionRelevantColumns]

# Row index shared by every per-checkpoint table built in this notebook
# (checkpointArrayStr is defined outside this notebook).
timedSectionsIndex = checkpointArrayStr
# Column layouts of the per-checkpoint result tables.
timedSectionsReachedColumns = ['firstReached', 'firstCompletionDuration']
timedSectionsDeathsColumns = ['deathsCount']
# Layout of per-session event counts (one row per section) ...
eventSectionsCountColumns = ['section', 'count']
# ... and of per-user event counts (indexed by timedSectionsIndex).
eventSectionsColumns = ['count']

Functions


In [ ]:
## Comparison between game and Google form performance

In [ ]:
# Returns a given session's checkpoints, the first server time at which they were reached, and completion time
def getCheckpointsCompletionTimes( sessionId, _rmDF, defaultTime=pd.Timedelta.max):
    """Build the per-checkpoint timing table of one session.

    Parameters
    ----------
    sessionId : identifier compared against the 'sessionId' column of _rmDF.
    _rmDF : DataFrame of events with at least the columns 'type', 'sessionId',
        'serverTime' and 'section'.
    defaultTime : duration stored for every checkpoint whose completion
        duration could not be computed.

    Returns
    -------
    DataFrame indexed by timedSectionsIndex with the columns
    - 'firstReached': earliest 'serverTime' of a 'reach' event in that section,
      or pd.Timestamp(0, tz='utc') when the section was never reached;
    - 'firstCompletionDuration': difference with the previous row's
      'firstReached', pd.Timedelta(0) for the first checkpoint when it was
      reached, defaultTime otherwise.
    """
    # Sentinel meaning "never reached".
    neverReached = pd.Timestamp(0, tz='utc')

    reachEvents = _rmDF[_rmDF['type']=='reach'].loc[:,perSessionRelevantColumns]
    perSession = reachEvents[reachEvents['sessionId']==sessionId]
    perSession = perSession[perSession['section'].str.startswith('tutorial', na=False)]

    timedSections = pd.DataFrame(data=0, columns=timedSectionsReachedColumns,index=timedSectionsIndex)
    timedSections['firstReached'] = neverReached
    # Honour defaultTime even when the session has no 'reach' event at all.
    timedSections['firstCompletionDuration'] = defaultTime

    if(len(perSession) > 0):
        # Select the column explicitly so that a Series (not a one-column frame) is assigned;
        # sections without any event become NaT through the reindex.
        firstReached = perSession.groupby("section")["serverTime"].min()
        timedSections["firstReached"] = firstReached.reindex(timedSections.index)
        timedSections["firstCompletionDuration"] = timedSections["firstReached"].diff()

        # The first checkpoint has no predecessor: its duration is zero, but only
        # if it was really reached (unreached rows are NaT at this point).
        firstCheckpoint = tutorialStem + checkpointStem + "00"
        firstCheckpointTime = timedSections.loc[firstCheckpoint, "firstReached"]
        if pd.notna(firstCheckpointTime) and (firstCheckpointTime != neverReached):
            timedSections.loc[firstCheckpoint, "firstCompletionDuration"] = pd.Timedelta(0)

    timedSections["firstReached"] = timedSections["firstReached"].fillna(neverReached)
    timedSections["firstCompletionDuration"] = timedSections["firstCompletionDuration"].fillna(defaultTime)

    return timedSections

In [ ]:
# Returns a given user's checkpoints, the first server time at which they were reached, and completion time
def getCheckpointsCompletionTimesUser( userId, _rmDF, _sessionsList = []):
    """Merge the checkpoint timing tables of all of a user's sessions.

    For each checkpoint, the row of the session that reached it earliest is
    kept. Checkpoints never reached keep pd.Timestamp(0, tz='utc') and
    pd.Timedelta.max. When _sessionsList is empty, the sessions are looked up
    with getUserSessions(_rmDF, userId).
    """
    neverReached = pd.Timestamp(0, tz='utc')

    # Resolve the sessions unless the caller supplied them
    if len(_sessionsList) == 0:
        _sessionsList = getUserSessions(_rmDF, userId)

    merged = pd.DataFrame(data=0, columns=timedSectionsReachedColumns, index=timedSectionsIndex)
    merged["firstReached"] = neverReached
    merged["firstCompletionDuration"] = pd.Timedelta.max

    for sessionId in _sessionsList:
        sessionTimes = getCheckpointsCompletionTimes(sessionId, _rmDF=_rmDF)

        for checkpoint in sessionTimes.index:
            candidate = sessionTimes.loc[checkpoint, 'firstReached']
            # Checkpoint not reached during this session: nothing to merge
            if candidate == neverReached:
                continue

            current = merged.loc[checkpoint, 'firstReached']
            # Keep the oldest time at which the checkpoint was reached
            if (current == neverReached) or (current > candidate):
                merged.loc[checkpoint, 'firstReached'] = candidate
                merged.loc[checkpoint, 'firstCompletionDuration'] = sessionTimes.loc[checkpoint, 'firstCompletionDuration']

    return merged

In [ ]:
# Returns a given session's checkpoints and time spent on each checkpoint
def getCheckpointsTotalTimes( sessionId, _rmDF):
    """Sum, per checkpoint, the user time spent in it during one session.

    Events of the session whose section is in timedSectionsIndex are sorted by
    'userTime' and split into runs of consecutive events sharing a section.
    Each run is credited with the time from its first event to the first
    event of the next run; the last run is credited up to the last event.

    Returns a Series of Timedelta indexed by timedSectionsIndex.
    """
    # TODO FIXME better version:
    #  take type into account, especially events of survey opening
    #  take NaN sections into account
    # otherwise, game time is added without actual play

    events = _rmDF[_rmDF["sessionId"] == sessionId]
    events = events[events['section'].isin(timedSectionsIndex)]
    events = events.loc[:, ["section", "userTime"]].sort_values(by="userTime")

    totalTimes = pd.Series(index=timedSectionsIndex, data=pd.Timedelta(0))

    if len(events) == 0:
        return totalTimes

    sections = events["section"].tolist()
    times = events["userTime"].tolist()

    # Section of the run being measured and time of its first event
    runSection, runStart = sections[0], times[0]

    for eventSection, eventTime in zip(sections, times):
        if eventSection != runSection:
            # Section changed: close the current run and open a new one
            totalTimes[runSection] += eventTime - runStart
            runSection, runStart = eventSection, eventTime

    # Close the last run with the last event of the session
    totalTimes[runSection] += times[-1] - runStart
    return totalTimes

In [ ]:
# Returns a given user's checkpoints and the total time spent on each, over all sessions
def getCheckpointsTotalTimesUser( userId, _rmDF, _sessionsList = []):
    """Add up getCheckpointsTotalTimes over every session of a user.

    When _sessionsList is empty, the sessions are looked up with
    getUserSessions(_rmDF, userId). Returns a Series of Timedelta indexed by
    timedSectionsIndex.
    """
    if len(_sessionsList) == 0:
        _sessionsList = getUserSessions(_rmDF, userId)

    userTotals = pd.Series(index=timedSectionsIndex, data=pd.Timedelta(0))

    # Merge the per-session totals by adding them
    for sessionId in _sessionsList:
        userTotals = userTotals + getCheckpointsTotalTimes(sessionId, _rmDF)

    return userTotals

In [ ]:
def getPlayedTimeSessionMode(sessionEvents, mode, strictEvents=True, strictSection=True):
    """Compute the time played in one game mode from a session's events.

    Parameters
    ----------
    sessionEvents : DataFrame of events with the columns 'section', 'type'
        and 'userTime'.
    mode : prefix that the 'section' value must start with (e.g. 'tutorial').
    strictEvents : when True, events whose 'type' is in noSectionEventCodes
        are ignored.
    strictSection : when True, events without a section are ignored; when
        False they are counted as belonging to the mode.

    Returns
    -------
    dict with
    - 'daysSpent': set of day timestamps on which at least one event occurred;
    - 'totalSpentTime': sum over those days of (last event - first event).
    """
    sessionEvents = sessionEvents[sessionEvents['section'].str.startswith(mode, na=(not strictSection))]

    if strictEvents:
        sessionEvents = sessionEvents[~sessionEvents["type"].isin(noSectionEventCodes)]

    # Index the timestamps by themselves so that they can be grouped per calendar day.
    # Work on a copy so that the filtered frame is left untouched.
    sessionTimes = sessionEvents['userTime'].copy()
    sessionTimes.index = sessionTimes.values

    daysSpent = set()
    totalSpentTime = pd.Timedelta(0)

    if(len(sessionTimes) > 0):
        # pd.TimeGrouper and dict-based renaming in agg were removed from pandas;
        # pd.Grouper and named aggregation are their replacements.
        perDay = sessionTimes.groupby(pd.Grouper(freq='D')).agg(start='min', end='max')
        # Frequency grouping also yields the empty days lying between two
        # active days; drop them so that only days with events are counted.
        perDay = perDay.dropna(subset=['start'])

        daysSpent = set(perDay.index)

        perDay['played'] = perDay['end'] - perDay['start']
        totalSpentTime = perDay['played'].sum()

    return {'daysSpent': daysSpent, 'totalSpentTime': totalSpentTime}

In [ ]:
# Returns a given session's total playtime and day count
def getPlayedTimeSession( sessionId, _rmDF):
    """Return the played time of one session, split by game mode.

    The result maps 'tutorial' and 'sandbox' to the dictionaries produced by
    getPlayedTimeSessionMode. Events without a section are counted in the
    tutorial (strictSection=False), not in the sandbox.
    """
    eventsOfSession = _rmDF[_rmDF['sessionId']==sessionId]
    return {
        'tutorial': getPlayedTimeSessionMode(eventsOfSession, 'tutorial', strictSection=False),
        'sandbox': getPlayedTimeSessionMode(eventsOfSession, 'sandbox'),
    }

In [ ]:
def mergePlayedTimes(a, b):
    """Combine two played-time dictionaries, game mode by game mode.

    For every game mode of a, the total times are added and the sets of days
    are united. b must contain every game mode of a. Neither input is modified.
    """
    merged = a.copy()
    for gameMode, timesA in a.items():
        timesB = b[gameMode]
        merged[gameMode] = {
            'totalSpentTime': timesA['totalSpentTime'] + timesB['totalSpentTime'],
            'daysSpent': timesA['daysSpent'] | timesB['daysSpent'],
        }
    return merged

In [ ]:
# Returns a given user's total playtime and day count
def getPlayedTimeUser( userId, _rmDF, _sessionsList = []):
    """Accumulate getPlayedTimeSession over every session of a user.

    When _sessionsList is empty, the sessions are looked up with
    getUserSessions(_rmDF, userId). The result has the same layout as the one
    of getPlayedTimeSession.
    """
    # Querying the session id '' provides the starting value of the accumulation
    accumulated = getPlayedTimeSession('', _rmDF = _rmDF)

    if len(_sessionsList) == 0:
        _sessionsList = getUserSessions(_rmDF, userId)

    for sessionId in _sessionsList:
        accumulated = mergePlayedTimes(accumulated, getPlayedTimeSession(sessionId, _rmDF))

    return accumulated

In [ ]:
# Returns a given session's checkpoints, and death count
def getDeaths( sessionId, _rmDF):
    """Count the 'death' events of one session per tutorial section.

    Returns a DataFrame with one row per section in which the session has at
    least one death, with the columns 'section' and 'deathsCount'.
    """
    deaths = _rmDF.loc[_rmDF['type']=='death', perSessionRelevantColumns]
    deaths = deaths[deaths['sessionId']==sessionId]
    inTutorial = deaths['section'].str.startswith('tutorial', na=False)
    return deaths[inTutorial].groupby("section").size().reset_index(name='deathsCount')

In [ ]:
def getDeathsUser( userId, _rmDF):
    """Count a user's deaths per checkpoint, over all of their sessions.

    Returns a DataFrame indexed by timedSectionsIndex with the single column
    'deathsCount'. Raises KeyError if a session reports deaths in a section
    that is not part of timedSectionsIndex.
    """
    # List of associated sessions
    sessionsList = getUserSessions(_rmDF, userId)

    # Call getDeaths on all sessions associated with user,
    # then merge by adding
    deathsSections = pd.DataFrame(0, columns=timedSectionsDeathsColumns,index=timedSectionsIndex)

    for sessionId in sessionsList:
        # getDeaths needs the events frame as its second argument
        deaths = getDeaths( sessionId, _rmDF )

        for checkpointName, deathsCount in zip(deaths['section'], deaths['deathsCount']):
            # .loc writes into the frame itself; chained indexing may write into a copy
            deathsSections.loc[checkpointName, 'deathsCount'] += deathsCount

    return deathsSections

Craft events: equip, unequip, add, remove

event-column association

equip device = 'add' + customData.device

unequip device = 'remove' + customData.device

add brick = 'add' + customData.biobrick

remove brick = 'remove' + customData.biobrick

In [ ]:
# Static data
# craftEventsColumns = pd.DataFrame(
#    index=list(range(4)),
#    data={
#        'eventCode' : pd.Categorical(["equip","unequip","add","remove"]),
#        'eventType' : pd.Categorical(["add","remove","add","remove"]),
#        'column' : pd.Categorical(["customData.device","customData.device","customData.biobrick","customData.biobrick"]),
#    }
#)
#craftEventsColumns

In [ ]:
# Static data
# Craft event codes used by this notebook; they are not RedMetrics event types.
craftEventCodes = list(["equip","unequip","add","remove"])
# For each craft event code: the event 'type' to filter on ('eventType') and
# the custom data column that must be non-null for the event to count ('column').
craftEventsColumns = pd.DataFrame(
    index=craftEventCodes,
    data={
        'eventType' : pd.Categorical(["add","remove","add","remove"]),
        'column' : pd.Categorical(["customData.device","customData.device","customData.biobrick","customData.biobrick"]),
    }
)

In [ ]:
# Returns a given session's checkpoints, and event count
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getSectionsCraftEvents( eventCode, sessionId, _rmDF):
    """Count one kind of craft event per tutorial section for one session.

    eventCode must be one of craftEventCodes; otherwise a message is printed
    and an empty frame is returned. The events counted are those whose 'type'
    and non-null custom data column match craftEventsColumns.

    Returns a DataFrame with the columns 'section' and 'count'.
    """
    if eventCode not in craftEventCodes:
        print("incorrect event code '" + eventCode + "'")
        return pd.DataFrame(0, columns=eventSectionsCountColumns, index=range(0))

    eventType = craftEventsColumns.loc[eventCode, 'eventType']
    dataColumn = craftEventsColumns.loc[eventCode, 'column']

    # Events of the right type that carry the expected custom data
    candidates = _rmDF[_rmDF['type']==eventType]
    candidates = candidates[candidates[dataColumn].notnull()]
    candidates = candidates.loc[:, perSessionRelevantColumns]

    ofSession = candidates[candidates['sessionId']==sessionId]
    inTutorial = ofSession['section'].str.startswith('tutorial', na=False)
    return ofSession[inTutorial].groupby("section").size().reset_index(name='count')

In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserSectionsCraftEvents( eventCode, userId, _rmDF, sessionsList = []):
    """Count one kind of craft event per checkpoint for a user, over all sessions.

    eventCode must be one of craftEventCodes; otherwise a message is printed
    and the counts stay at zero. When sessionsList is empty, the sessions are
    looked up with getUserSessions(_rmDF, userId).

    Returns a DataFrame indexed by timedSectionsIndex with the single column
    'count'. Raises KeyError if a session reports events in a section that is
    not part of timedSectionsIndex.
    """
    # Call getSectionsCraftEvents on all sessions associated with user,
    # then merge by adding
    userSectionsEvents = pd.DataFrame(0, columns=eventSectionsColumns,index=timedSectionsIndex)

    if eventCode in craftEventCodes:
        # List of associated sessions
        if(len(sessionsList) == 0):
            sessionsList = getUserSessions(_rmDF, userId)

        for sessionId in sessionsList:
            # getSectionsCraftEvents needs the events frame as its third argument
            sessionSectionsEvents = getSectionsCraftEvents( eventCode, sessionId, _rmDF )

            for checkpointName, count in zip(sessionSectionsEvents['section'], sessionSectionsEvents['count']):
                # .loc writes into the frame itself; chained indexing may write into a copy
                userSectionsEvents.loc[checkpointName, 'count'] += count
    else:
        print("incorrect event code '" + eventCode + "'")
    return userSectionsEvents

In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserSectionsCraftEventsTotal( eventCode, userId, _rmDF, sessionsList = [] ):
    """Return the sum over all checkpoints of getUserSectionsCraftEvents."""
    perCheckpoint = getUserSectionsCraftEvents( eventCode, userId, _rmDF, sessionsList )
    total = perCheckpoint.values.sum()
    return total

craft, no section


In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserCraftEventsTotal( eventCode, userId, _rmDF, sessionsList=[]):
    """Count a user's craft events of one kind, whatever their section.

    eventCode must be one of craftEventCodes; otherwise a message is printed
    and 0 is returned. When sessionsList is empty, the sessions are looked up
    with getUserSessions(_rmDF, userId).
    """
    if len(sessionsList) == 0:
        sessionsList = getUserSessions(_rmDF, userId)

    if eventCode not in craftEventCodes:
        print("incorrect event code '" + eventCode + "'")
        return 0

    eventType = craftEventsColumns.loc[eventCode, 'eventType']
    dataColumn = craftEventsColumns.loc[eventCode, 'column']

    # Events of the right type that carry the expected custom data
    candidates = _rmDF[_rmDF['type']==eventType]
    candidates = candidates[candidates[dataColumn].notnull()]
    ofUser = candidates[candidates['sessionId'].isin(sessionsList)]
    return len(ofUser)

Generic functions

Generic count


In [ ]:
# Returns a given session's checkpoints, and event count
def getSectionsEvents( eventType, sessionId, _rmDF):
    """Count the events of one type per tutorial section for one session.

    Returns a DataFrame with one row per section in which the session has at
    least one such event, with the columns 'section' and 'count'.
    """
    typed = _rmDF.loc[_rmDF['type']==eventType, perSessionRelevantColumns]
    ofSession = typed[typed['sessionId']==sessionId]
    inTutorial = ofSession['section'].str.startswith('tutorial', na=False)
    return ofSession[inTutorial].groupby("section").size().reset_index(name='count')

In [ ]:
def getUserSectionsEvents( eventType, userId, _rmDF, sessionsList=[]):
    """Count the events of one type per checkpoint for a user, over all sessions.

    When sessionsList is empty, the sessions are looked up with
    getUserSessions(_rmDF, userId).

    Returns a DataFrame indexed by timedSectionsIndex with the single column
    'count'. Raises KeyError if a session reports events in a section that is
    not part of timedSectionsIndex.
    """
    # List of associated sessions
    if(len(sessionsList) == 0):
        sessionsList = getUserSessions(_rmDF, userId)

    # Call getSectionsEvents on all sessions associated with user,
    # then merge by adding
    userSectionsEvents = pd.DataFrame(0, columns=eventSectionsColumns,index=timedSectionsIndex)

    for sessionId in sessionsList:
        # getSectionsEvents needs the events frame as its third argument
        sessionSectionsEvents = getSectionsEvents( eventType, sessionId, _rmDF )

        for checkpointName, count in zip(sessionSectionsEvents['section'], sessionSectionsEvents['count']):
            # .loc writes into the frame itself; chained indexing may write into a copy
            userSectionsEvents.loc[checkpointName, 'count'] += count

    return userSectionsEvents

In [ ]:
def getUserSectionsEventsTotal( eventType, userId, _rmDF, sessionsList=[] ):
    """Return the sum over all checkpoints of getUserSectionsEvents."""
    perCheckpoint = getUserSectionsEvents( eventType, userId, _rmDF, sessionsList )
    total = perCheckpoint.values.sum()
    return total

No section events


In [ ]:
def getUserEventsTotal( eventType, userId, _rmDF, sessionsList=[]):
    """Count a user's events of one type, whatever their section.

    When sessionsList is empty, the sessions are looked up with
    getUserSessions(_rmDF, userId).
    """
    if len(sessionsList) == 0:
        sessionsList = getUserSessions(_rmDF, userId)

    # An event counts when it has the right type and belongs to one of the sessions
    isMatch = (_rmDF['type']==eventType) & _rmDF['sessionId'].isin(sessionsList)
    return len(_rmDF[isMatch])

Other


In [ ]:
# Returns a given user's unique reached checkpoints
def getUserCheckpoints( userId, _rmDF):
    """List the distinct tutorial sections reached by a user.

    The sections are taken from the 'reach' events of every session returned
    by getUserSessions(_rmDF, userId). Returns a Series of section names, in
    order of first appearance.
    """
    sessionsList = getUserSessions(_rmDF, userId)

    reached = _rmDF.loc[_rmDF['type']=='reach', perSessionRelevantColumns]
    reached = reached[reached['sessionId'].isin(sessionsList)]
    inTutorial = reached['section'].str.startswith('tutorial', na=False)
    return pd.Series(reached.loc[inTutorial, 'section'].unique())


def getDiscrepancyGameGForm( userId, _rmDF=None ):
    """Compare the checkpoints validated in the survey with those reached in the game.

    Parameters
    ----------
    userId : user identifier passed to getNonValidatedCheckpoints,
        getValidatedCheckpoints and getUserCheckpoints.
    _rmDF : events frame forwarded to getUserCheckpoints. It is required:
        getUserCheckpoints cannot be called without it.

    Returns
    -------
    (gameNotEnough, gformNotEnough), two Series:
    - gameNotEnough: checkpoints validated in the survey but not reached in the game;
    - gformNotEnough: non-validated survey checkpoints that compare greater
      than or equal to the furthest checkpoint reached in the game.

    Raises
    ------
    TypeError when _rmDF is not provided.
    """
    if _rmDF is None:
        raise TypeError("getDiscrepancyGameGForm() requires the '_rmDF' events frame")

    gformNonVal = getNonValidatedCheckpoints(userId)
    gformVal = getValidatedCheckpoints(userId)
    gameVal = getUserCheckpoints(userId, _rmDF)

    #user has answered questions whose answer they haven't seen in the game
    #(sorted, unique values of gformVal that are not in gameVal)
    gameNotEnough = pd.Series(np.setdiff1d(gformVal.values, gameVal.values))

    #user has not answered questions whose answer they have seen in the game
    # The furthest checkpoint must be stored; '' is kept when nothing was reached.
    maxGameVal = ''
    if gameVal.values.size!=0:
        maxGameVal = gameVal.values.max()
    # NOTE(review): '>=' keeps the non-validated checkpoints located at or beyond
    # the furthest reached one, whereas the comment above suggests '<='. Confirm the intent.
    gformNotEnough = pd.Series([nonVal for nonVal in gformNonVal.values if nonVal >= maxGameVal])

    return (gameNotEnough, gformNotEnough)

In [ ]:
# Static data
# Event types excluded from the played-time computation when
# getPlayedTimeSessionMode is called with strictEvents=True.
noSectionEventCodes = list([
                            'configure',
                            'gotomooc',
                            'gotostudy',
                            'gotourl',
                            'restart',
                            'selectmenu',
                            'start',
                            'switch',
])

In [ ]:
# Event codes counted one by one in the per-user data vector (see getUserDataVector).
# Codes that are also in craftEventCodes are counted with getUserCraftEventsTotal,
# the others with getUserEventsTotal.
simpleEvents = [
    'complete',
    'configure',
    'craft',
    'death',
    'equip',
    'unequip',
    'add',
    'remove',
    'gotomooc',
    'gotourl',
    'pickup',
    'reach',
    'restart',
    'selectmenu',
    'start',
    'switch',
    ]

# possible events: complete	configure	craft	death	equip	gotomooc	gotourl	pickup	reach	restart	selectmenu	start	switch	unequip

# Initial row labels of the per-user data vector: the sessions count first ...
userDataVectorIndex = [#game
                       'sessionsCount',
                       ]

# ... then one score row per survey temporality
# (answerTemporalities and scoreLabel are defined outside this notebook) ...
for temporality in answerTemporalities:
    userDataVectorIndex.append(scoreLabel + temporality)

# ... and finally one row per simple event. The result is a numpy array.
userDataVectorIndex = np.concatenate( (userDataVectorIndex,
                                     simpleEvents))

In [ ]:
#allEvents = rmdf1522['type'].unique()
#allEvents = np.concatenate( simpleEvents, allEvents ).unique()
#allUserDataVectorIndex = np.concatenate( userDataVectorIndex, allEvents ).unique()

In [ ]:
# Labels of the overall survey score rows.
overallScoreCriteria = ["scorepretest", "scoreposttest", "scoredelta",]

# "ch00" ... "ch14": one two-digit stem per chapter.
stemTimesCriteria = ["ch" + "{0:0=2d}".format(i) for i in range(0,15)]
# Labels of the per-chapter and overall completion time rows.
completionTimesCriteria = [st + "completion" for st in stemTimesCriteria] + ["completionTime"]
# Labels of the per-chapter and overall total time rows.
totalTimesCriteria = [st + "total" for st in stemTimesCriteria] + ["totalTime"]

In [ ]:
# userId is RedMetrics user id
# _source is used as correction source, if we want to include answers to these questions
def getUserDataVector(userId, _rmDF, _gfDF, _source = correctAnswers, _printDebug = True, _binary=True):
    """Build the one-column table of game and survey features of a user.

    Parameters
    ----------
    userId : RedMetrics user id.
    _rmDF : DataFrame of game events.
    _gfDF : DataFrame of survey answers; it must contain at least one row.
    _source : correction source forwarded to getBinarized / getNumeric; when
        it is empty, no per-question rows are added.
    _printDebug : when True, warnings about missing survey answers are printed.
    _binary : when True, questions are scored with getBinarized, otherwise
        with getNumeric.

    Returns
    -------
    DataFrame with a single column named str(userId). Its rows start with
    userDataVectorIndex and are extended with 'scoredelta', 'maxChapter',
    'efficiency', 'thoroughness', 'fun', the completion and total times per
    checkpoint, and, if the user answered the survey, one pretest, posttest
    and delta row per question.
    """

    sessionsList = getUserSessions(_rmDF, userId)

    columnName = str(userId)

    data = pd.DataFrame(0, columns=[columnName], index=userDataVectorIndex)

    # Survey scores: last pretest score, first score of the other temporalities
    score = getScore(userId, _gfDF)
    for _temporality in score.columns:
        _score = score.loc[scoreLabel,_temporality]
        if(len(_score)>0):
            if(_temporality == answerTemporalities[0]):
                _score = _score[len(_score)-1]
            else:
                _score = _score[0]
        else:
            _score = np.nan
        data.loc[scoreLabel+_temporality, columnName] = _score
    data.loc[scoreLabel+"delta", columnName] = data.loc[scoreLabel+"posttest", columnName] - data.loc[scoreLabel+"pretest", columnName]

    data.loc['sessionsCount',columnName] = len(sessionsList)

    for eventName in simpleEvents:
        if eventName in craftEventCodes:
            data.loc[eventName,columnName] = getUserCraftEventsTotal(eventName, userId, _rmDF, sessionsList)
        else:
            data.loc[eventName,columnName] = getUserEventsTotal(eventName, userId, _rmDF, sessionsList)

    # Furthest chapter reached: the two last characters of the greatest checkpoint
    # name, checkpoint 00 being the minimum. Series.append was removed in pandas 2.0,
    # hence pd.concat.
    reachedCheckpoints = pd.concat([
        pd.Series([tutorialStem + checkpointStem + '00']),
        getUserCheckpoints(userId, _rmDF = _rmDF),
    ])
    data.loc['maxChapter', columnName] = int(reachedCheckpoints.max()[-2:])

    # time spent on each chapter
    completionTimes = getCheckpointsCompletionTimesUser(userId, _rmDF = _rmDF)

    completionTime = 0
    # Explicit dtype: an empty Series without dtype is deprecated
    checkpointCompletionTime = pd.Series(dtype=float)
    for checkpoint in timedSectionsIndex:
        deltaTime = completionTimes.loc[checkpoint,"firstCompletionDuration"].total_seconds()
        checkpointCompletionTime.loc["ch" + (checkpoint[-2:]) + "completion"] = deltaTime
        completionTime += deltaTime

    # efficiency = (1 + #unlockedchapters)/(time * (1 + #death + #craft + #add + #equip))
    data.loc['efficiency', columnName] = np.log(( 1 + data.loc['maxChapter', columnName] ) / \
                                        (completionTime \
                                         * ( 1\
                                            + data.loc['death', columnName] \
                                            + data.loc['craft', columnName]\
                                            + data.loc['add', columnName]\
                                            + data.loc['equip', columnName]\
                                           )\
                                        ))

    playedTime = getPlayedTimeUser(userId, _rmDF = _rmDF)

    data.loc['thoroughness', columnName] = \
    data.loc['craft', columnName]\
    * data.loc['pickup', columnName]\
    * ( 1 + np.power(len(playedTime['sandbox']['daysSpent']),2))

    totalSpentTime = playedTime['tutorial']['totalSpentTime'] + playedTime['sandbox']['totalSpentTime']
    totalSpentDays = len(playedTime['tutorial']['daysSpent'] | playedTime['sandbox']['daysSpent'])
    data.loc['fun', columnName] = np.log(\
                                    max(1,\
                                        totalSpentTime.total_seconds()
                                        * np.power(totalSpentDays,2)
                                       ))

    data.loc['completionTime', columnName] = completionTime
    for time in checkpointCompletionTime.index:
        data.loc[time,columnName] = checkpointCompletionTime.loc[time]

    totalTimes = getCheckpointsTotalTimesUser(userId, _rmDF = _rmDF)
    for checkpoint in timedSectionsIndex:
        data.loc["ch" + (checkpoint[-2:]) + "total",columnName] = totalTimes[checkpoint].total_seconds()
    data.loc["totalTime",columnName] = totalTimes.sum().total_seconds()

    # All-NaN answer, used in place of a missing pretest or posttest
    emptyAnswer = _gfDF.iloc[0].copy()
    emptyAnswer[:] = np.nan

    if(len(_source) != 0):
        if hasAnswered(userId, _gfDF):
            gformLine = _gfDF[_gfDF[localplayerguidkey] == userId]
            gformLinePretest = gformLine.iloc[0]
            gformLinePosttest = gformLine.iloc[0]

            pretests = gformLine[gformLine[QTemporality] == answerTemporalities[0]]
            posttests = gformLine[gformLine[QTemporality] == answerTemporalities[1]]
            undefined = gformLine[gformLine[QTemporality] == answerTemporalities[2]]

            if (len(pretests) > 0) and (len(posttests) > 0):
                # take last pretest and first posttest
                # TODO add date/time checks
                gformLinePretest = pretests.iloc[-1]
                gformLinePosttest = posttests.iloc[0]

            elif (len(posttests) > 0):
                if _printDebug:
                    print("warning: no pretest for u="+userId)
                gformLinePretest = emptyAnswer
                gformLinePosttest = posttests.iloc[0]

            elif (len(pretests) > 0):
                if _printDebug:
                    print("warning: no posttest for u="+userId)
                gformLinePosttest = emptyAnswer

            else:
                if _printDebug:
                    print("warning: only undefined survey answers for u="+userId)
                gformLinePretest = emptyAnswer
                gformLinePosttest = emptyAnswer

            # add data from the gform: binary/numeric score on each question
            gformDataPretest = []
            gformDataPosttest = []
            if _binary:
                gformDataPretest = getBinarized(gformLinePretest, _source = _source)
                gformDataPosttest = getBinarized(gformLinePosttest, _source = _source)
            else:
                gformDataPretest = getNumeric(gformLinePretest, _source = _source)
                gformDataPosttest = getNumeric(gformLinePosttest, _source = _source)
            gformDataDelta = gformDataPosttest - gformDataPretest

            for question in gformDataPretest.index:
                data.loc[answerTemporalities[0] + " " + question,columnName] = gformDataPretest.loc[question]
            for question in gformDataPretest.index:
                data.loc[answerTemporalities[1] + " " + question,columnName] = gformDataPosttest.loc[question]
            for question in gformDataPretest.index:
                data.loc["delta "                     + question,columnName] = gformDataDelta.loc[question]
        else:
            if _printDebug:
                print("warning: user " + userId + " has never answered the survey")

    return data

In [ ]:
# for per-session, manual analysis
def getSessionDataPreview( _sessionId, _rmDF):
    """Summarize one session for manual inspection.

    Parameters
    ----------
    _sessionId : identifier compared against the 'sessionId' column of _rmDF.
    _rmDF : DataFrame of events with the columns 'sessionId', 'userTime',
        'customData.platform' and 'type'.

    Returns
    -------
    dict with
    - 'first', 'last': first and last values of 'userTime' once sorted
      (pd.NaT when the session has no event);
    - 'platform': first non-null 'customData.platform' value, '' if none;
    - 'events': number of events per 'type' (Series from value_counts).
    """
    _logs = _rmDF[_rmDF['sessionId'] == _sessionId]

    _timedEvents = _logs['userTime'].sort_values()
    # A session without events has no first nor last event: report NaT
    # instead of failing on the positional access.
    if(len(_timedEvents) > 0):
        _first = _timedEvents.iloc[0]
        _last = _timedEvents.iloc[-1]
    else:
        _first = pd.NaT
        _last = pd.NaT

    _platform = _logs['customData.platform'].dropna().values
    if(len(_platform) > 0):
        _platform = _platform[0]
    else:
        _platform = ''

    _events = _logs['type'].value_counts()
    return {
        'first' : _first,
        'last' : _last,
        'platform' : _platform,
        'events' : _events
    }

In [ ]:
# for per-user, manual analysis
def getUserDataPreview(userId, _rmDF, _gfDF):
    """Build a one-column overview of a user for manual inspection.

    The result is a DataFrame whose single column is named userId. Its rows
    are, in order: a RedMetrics header, the sessions count, the first event
    date, four rows per session (platform, first event, last event, event
    counts), a survey header, one score row per temporality, the raw scores,
    and the rows derived from getGFormDataPreview.

    Checklist of the planned content (the boxes were left unticked in the original):
    """
#    [ ] RM
#      [ ] sessions count
#      [ ] first event date
#      [ ] time played
#      [ ] dates played
#      [ ] first played, last played
#      [ ] best chapter
#      [ ] counts of events: deaths, crafts,...
#      [ ] gaming platform
#    [ ] GF
#      [ ] score(s)
#        [ ] progression
#      [ ] temporality
#        [ ] temporality according to answers
#        [ ] #before
#        [ ] #after
#      [ ] demographics

    result = pd.DataFrame(
        columns = [userId]
    )

    #    [ ] RM
    result.loc['REDMETRICS ANALYSIS'] = ' '
    #      [ ] sessions count
    sessions = getUserSessions(_rmDF, userId)
    result.loc['sessions', userId] = len(sessions)
    #      [ ] first event date
    # NOTE(review): the other helpers of this notebook also take the events frame;
    # confirm that getFirstEventDate really expects only the user id.
    result.loc['firstEvent', userId] = getFirstEventDate( userId )
    #      [ ] time played
    #      [ ] dates played
    #      [ ] first played, last played
    # NOTE(review): here the result of getUserSessions is indexed by 'sessionId',
    # whereas other functions of this notebook iterate over it directly as a
    # collection of session ids; confirm the actual return type.
    sessionIds = sessions['sessionId']
    for _sessionIdIndex in range(0, len(sessions['sessionId'])):
        _sessionId = sessionIds.iloc[_sessionIdIndex]
        sdp = getSessionDataPreview(_sessionId, _rmDF = _rmDF)

        result.loc['session' + str(_sessionIdIndex) + ' platform',userId] = sdp['platform']
        result.loc['session' + str(_sessionIdIndex) + ' first',userId] = sdp['first']
        result.loc['session' + str(_sessionIdIndex) + ' last',userId] = sdp['last']
        result.loc['session' + str(_sessionIdIndex) + ' events',userId] = str(sdp['events'])
    #      [ ] best chapter
    #      [ ] counts of events: deaths, crafts,...

    #    [ ] GF
    result.loc['GFORM ANALYSIS'] = ' '
    #      [ ] score(s)
    # Same rule as in getUserDataVector: last score of the first temporality,
    # first score of the other ones, NaN when there is none.
    score = getScore(userId, _gfDF)
    for _temporality in score.columns:
        _score = score.loc[scoreLabel,_temporality]
        if(len(_score)>0):
            if(_temporality == answerTemporalities[0]):
                _score = _score[len(_score)-1]
            else:
                _score = _score[0]
        else:
            _score = np.nan
        result.loc[scoreLabel+_temporality,userId] = _score
    #        [ ] progression
    #      [ ] demographics
    result.loc[scoreLabel+'s',userId] = str(score.values)

    # One group of rows per key of the survey preview: the five features below,
    # then one row per demographic match.
    gfDataPreview = getGFormDataPreview(userId, _gfDF)
    features = {1: 'date', 2: 'temporality RM', 3: 'temporality GF', 4: 'score', 5: 'genderAge'}
    for key in gfDataPreview:
        for featureKey in features:
            result.loc[key + ' ' + features[featureKey]] = str(gfDataPreview[key][features[featureKey]])
        index = 0
        for match in gfDataPreview[key]['demographic matches']:
            result.loc[key + ' demographic match ' + str(index)] = repr(match)
            index += 1

    return result

Finding the quickest player who completed the game


In [ ]:
def getRecordPlayer(rmdf, gfdf):
    """Find the session with the smallest summed 'newownrecord' duration among complete runs.

    A session is a candidate when it has a 'newownrecord' event whose
    'customData.chapter' is '"10"', and it is kept when the sorted distinct
    chapters of its records equal chapterArrayStr.

    Returns the tuple (recordDuration, recordTime, recordAge, recordGender,
    recordLanguage, recordPlatform, recordUserId, recordSessionId).

    NOTE(review): when no session qualifies, recordSessionId stays '' and the
    lookups after the loop presumably fail on an empty selection; the same
    holds when the record user has no survey answer or no platform value.
    """
    
    newownrecords = rmdf[rmdf['type'] == 'newownrecord']
    # Sentinels: any real duration is smaller than Timedelta.max
    recordDuration = pd.Timedelta.max
    recordSessionId = ''

    # Chapter values are stored with their surrounding double quotes
    sessions = newownrecords[newownrecords['customData.chapter'] == '"10"']['sessionId']
    #print("#sessions="+str(len(sessions)))

    for sessionId in sessions:    
        #print(".", end="", flush=True)

        #print(str((len(sessionRecords['customData.chapter'].unique()) > 0)))
        sessionRecords = newownrecords[newownrecords['sessionId'] == sessionId]
        #print("#sessionRecords="+str(len(sessionRecords)))

        chaptersRows = sessionRecords[sessionRecords['customData.chapter'].isin(chapterArrayStr)]
        #print("#chaptersRows="+str(len(chaptersRows)))

        chaptersRowsChapters = sorted(chaptersRows['customData.chapter'].unique())    
        #print("#chaptersRowsChapters="+str(len(chaptersRowsChapters)))

        hasChapters = (len(chaptersRowsChapters) > 0)
        hasAllChapters = False
        if hasChapters:
            # List equality: assumes chapterArrayStr is a sorted list — TODO confirm
            hasAllChapters = (chaptersRowsChapters == chapterArrayStr)
        #print("hasChapters="+str(hasChapters)+", hasAllChapters="+str(hasAllChapters))
        #print(str(chaptersRowsChapters))

        if hasAllChapters:
            #print("complete")
            # Durations are strings wrapped in double quotes; the sum runs over
            # every record of the session (sessionRecords), not only chaptersRows.
            duration = pd.Timedelta(seconds=sum([int(t.replace('"', '')) for t in sessionRecords['customData.duration'].values]))
            if duration < recordDuration:
                recordDuration = duration
                recordSessionId = sessionId

    # Time of the last event of the record session, converted to Europe/Berlin
    recordTime = rmdf[rmdf['sessionId']==recordSessionId]['userTime'].max().tz_convert('Europe/Berlin')
    recordUserId = rmdf[rmdf['sessionId']==recordSessionId]['userId'].unique()[0]
    # First platform value found among all events of the record user
    recordPlatform = rmdf[rmdf['userId']==recordUserId]['customData.platform'].dropna().iloc[0]            
    

    # Demographics come from the first survey answer of the record user
    recordAnswers = gfdf[gfdf[QUserId] == recordUserId]
    recordGender = recordAnswers.iloc[0][QGender]
    recordAge = recordAnswers.iloc[0][QAge]
    recordLanguage = recordAnswers.iloc[0][QLanguage]

    return recordDuration, recordTime, recordAge, recordGender, recordLanguage, recordPlatform, recordUserId, recordSessionId

In [ ]:
# shortest NaN chapter newownrecord: shows that NaN doesn't mean "whole completion time"
# This cell reads the global frame rmdf1522, which is defined outside this notebook.

newownrecords = rmdf1522[rmdf1522['type'] == 'newownrecord']
# Records without a chapter (not used below: the loop recomputes the same selection)
nanChapternewownrecords = newownrecords[pd.isnull(newownrecords['customData.chapter'])]

#section?
# Sentinels: any real duration is smaller than Timedelta.max
recordDuration = pd.Timedelta.max
recordSessionId = ''
for sessionId in newownrecords[pd.isnull(newownrecords['customData.chapter'])]['sessionId']:
    #print(".", end="", flush=True)
    #print(str((len(sessionRecords['customData.chapter'].unique()) > 0)))
    sessionRecords = newownrecords[newownrecords['sessionId'] == sessionId]
    # Only the first chapter-less record of the session is considered; its
    # duration is a string wrapped in double quotes.
    duration = pd.Timedelta(seconds=int(sessionRecords[pd.isnull(sessionRecords['customData.chapter'])]['customData.duration'].values[0].replace('"','')))
    if duration < recordDuration:
        recordDuration = duration
        recordSessionId = sessionId
# Display the shortest duration found and its session
recordDuration, recordSessionId